library(mosaicData)
library(tidyverse)
library(GGally)
package ‘GGally’ was built under R version 3.6.2Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Load the diamonds.csv data set and undertake an initial exploration of the data. You will find a description of the meanings of the variables on the relevant Kaggle page
# Load the diamonds data set from CSV.
# NOTE(review): absolute, user-specific path — consider a relative path or
# here::here() so the script runs on other machines.
diamonds <- read.csv("/Users/user/codeclan_work/codeclan_homework_DebbieL/week_10/day_02/diamonds.csv")
# Structural overview: 53,940 rows x 11 columns (X is the CSV row index
# added by read.csv; see the glimpse output below).
glimpse(diamonds)
Rows: 53,940
Columns: 11
$ X [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23…
$ carat [3m[38;5;246m<dbl>[39m[23m 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.30, 0.23, 0.22, 0.3…
$ cut [3m[38;5;246m<fct>[39m[23m Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Very Good, Fair, Very …
$ color [3m[38;5;246m<fct>[39m[23m E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I, E, H, J, J, G, I, …
$ clarity [3m[38;5;246m<fct>[39m[23m SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, SI1, SI2, SI2, I1, …
$ depth [3m[38;5;246m<dbl>[39m[23m 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64.0, 62.8, 60.4, 62.…
$ table [3m[38;5;246m<dbl>[39m[23m 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58, 54, 54, 56, 59, 5…
$ price [3m[38;5;246m<int>[39m[23m 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 342, 344, 345, 345, 3…
$ x [3m[38;5;246m<dbl>[39m[23m 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.25, 3.93, 3.88, 4.3…
$ y [3m[38;5;246m<dbl>[39m[23m 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.28, 3.90, 3.84, 4.3…
$ z [3m[38;5;246m<dbl>[39m[23m 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.73, 2.46, 2.33, 2.7…
The price is in US dollars and carat is the weight of the diamond. Clarity is a measure of how clear the diamond is, ranging from I1 (worst) to IF (best).
We expect the carat of the diamonds to be strongly correlated with the physical dimensions x, y and z. Use ggpairs() to investigate correlations between these four variables.
alias(lm(carat ~ ., data = diamonds))
Model :
carat ~ X + cut + color + clarity + depth + table + price + x +
y + z
ggpairs(diamonds)
CODECLAN SOLUTION
ggpairs(diamonds[,c("carat", "x", "y", "z")])
So, we do find significant correlations. Let’s drop columns x, y and z from the dataset, in preparation to use only carat going forward
# Drop the physical dimensions x, y and z, keeping carat as the sole
# measure of diamond size going forward.
diamonds_trim <- select(diamonds, -x, -y, -z)
diamonds_trim
CODECLAN SOLUTION
# Remove the x, y and z dimension columns; carat carries the size information.
diamonds_tidy <- diamonds %>%
  select(-x, -y, -z)
diamonds_tidy
We are interested in developing a regression model for the price of a diamond in terms of the possible predictor variables in the dataset.
# Multiple regression of price on carat, cut, clarity and depth.
# lm() dummy-codes the categorical predictors automatically, absorbing one
# level of each into the intercept as the reference level.
model <- lm(price ~ carat + cut + clarity + depth, data = diamonds_trim)
model
Call:
lm(formula = price ~ carat + cut + clarity + depth, data = diamonds_trim)
Coefficients:
(Intercept) carat cutGood cutIdeal cutPremium cutVery Good clarityIF
-5895.89 8472.28 629.00 963.06 813.37 812.98 5191.30
claritySI1 claritySI2 clarityVS1 clarityVS2 clarityVVS1 clarityVVS2 depth
3493.80 2656.34 4344.65 4124.41 4887.13 4877.04 -26.23
Use ggpairs() to investigate correlations between price and the predictors (this may take a while to run, don’t worry, make coffee or something).
ggpairs(diamonds_trim)
CODECLAN SOLUTION
ggpairs(diamonds_tidy)
carat is strongly correlated with price. Boxplots of price split by cut, color and particularly clarity also show some variation.
Perform further ggplot visualisations of any significant correlations you find.
# Distribution of price by clarity level.
# Fixed: a scatterplot over a categorical axis overplots badly with 53,940
# points; a boxplot (as in the CODECLAN solution further down) shows the
# per-level distribution clearly.
diamonds_trim %>%
  ggplot(aes(x = clarity, y = price)) +
  geom_boxplot()
# Interaction model: the effect of carat on price is allowed to differ by cut.
alt_model <- lm(price ~ carat * cut, data = diamonds_trim)
# Fixed: the original call was missing its closing parenthesis (see the
# "Incomplete expression" error below).
# NOTE(review): ggPredict() comes from the ggiraphExtra package, which is not
# loaded in this script — confirm library(ggiraphExtra) is run first.
ggPredict(alt_model, interactive = TRUE)
Error: Incomplete expression: ggPredict(alt_model, interactive = TRUE
CODECLAN SOLUTION
# price vs carat with a fitted least-squares line overlaid.
ggplot(diamonds_tidy, aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Distribution of price across the levels of each categorical predictor.
ggplot(diamonds_tidy, aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(diamonds_tidy, aes(x = color, y = price)) +
  geom_boxplot()

ggplot(diamonds_tidy, aes(x = clarity, y = price)) +
  geom_boxplot()
Shortly we may try a regression fit using one or more of the categorical predictors cut, clarity and color, so let’s investigate these predictors:
Investigate the factor levels of these predictors. How many dummy variables do you expect for each of them?
CODECLAN SOLUTION
unique(diamonds_tidy$cut)
[1] Ideal Premium Good Very Good Fair
Levels: Fair Good Ideal Premium Very Good
# expect 4 dummies for cut
unique(diamonds_tidy$color)
[1] E I J H F G D
Levels: D E F G H I J
# expect 6 dummies for color
unique(diamonds_tidy$clarity)
[1] SI2 SI1 VS1 VS2 VVS2 VVS1 I1 IF
Levels: I1 IF SI1 SI2 VS1 VS2 VVS1 VVS2
Use the dummy_cols() function in the fastDummies package to generate dummies for these predictors and check the number of dummies in each case.
# fastDummies is attached here, so dummy_cols() can be called unqualified.
library(fastDummies)

# One indicator column per cut level, dropping the first level so it acts as
# the reference category.
diamonds_dummy_cut <- dummy_cols(
  diamonds_trim,
  select_columns = "cut",
  remove_first_dummy = TRUE
)
diamonds_dummy_cut
CODECLAN SOLUTION
# CODECLAN solution: dummies for all three categorical predictors in one call.
# Note this starts from the full `diamonds` data, so x, y and z are still
# present in the result (28 columns, per the glimpse below).
diamonds_dummies <- diamonds %>%
  dummy_cols(
    select_columns = c("cut", "clarity", "color"),
    remove_first_dummy = TRUE
  )
glimpse(diamonds_dummies)
Rows: 53,940
Columns: 28
$ X [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21…
$ carat [3m[38;5;246m<dbl>[39m[23m 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.30, 0.23, 0…
$ cut [3m[38;5;246m<fct>[39m[23m Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Very Good, Fai…
$ color [3m[38;5;246m<fct>[39m[23m E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I, E, H, J, J…
$ clarity [3m[38;5;246m<fct>[39m[23m SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, SI1, SI2, S…
$ depth [3m[38;5;246m<dbl>[39m[23m 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64.0, 62.8, 6…
$ table [3m[38;5;246m<dbl>[39m[23m 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58, 54, 54, 5…
$ price [3m[38;5;246m<int>[39m[23m 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 342, 344, 345…
$ x [3m[38;5;246m<dbl>[39m[23m 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.25, 3.93, 3…
$ y [3m[38;5;246m<dbl>[39m[23m 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.28, 3.90, 3…
$ z [3m[38;5;246m<dbl>[39m[23m 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.73, 2.46, 2…
$ cut_Good [3m[38;5;246m<int>[39m[23m 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0…
$ cut_Ideal [3m[38;5;246m<int>[39m[23m 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
$ cut_Premium [3m[38;5;246m<int>[39m[23m 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ `cut_Very Good` [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1…
$ clarity_IF [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ clarity_SI1 [3m[38;5;246m<int>[39m[23m 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1…
$ clarity_SI2 [3m[38;5;246m<int>[39m[23m 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0…
$ clarity_VS1 [3m[38;5;246m<int>[39m[23m 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ clarity_VS2 [3m[38;5;246m<int>[39m[23m 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
$ clarity_VVS1 [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ clarity_VVS2 [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ color_E [3m[38;5;246m<int>[39m[23m 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0…
$ color_F [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ color_G [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ color_H [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
$ color_I [3m[38;5;246m<int>[39m[23m 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0…
$ color_J [3m[38;5;246m<int>[39m[23m 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1…
# Indicator columns for clarity only, first level dropped as the reference.
diamonds_dummy_clarity <- dummy_cols(
  diamonds_trim,
  select_columns = "clarity",
  remove_first_dummy = TRUE
)
diamonds_dummy_clarity
NA
# Indicator columns for color only, first level dropped as the reference.
diamonds_dummy_color <- dummy_cols(
  diamonds_trim,
  select_columns = "color",
  remove_first_dummy = TRUE
)
diamonds_dummy_color
Going forward we’ll let R handle dummy variable creation for categorical predictors in regression fitting (remember lm() will generate the correct numbers of dummy levels automatically, absorbing one of the levels into the intercept as a reference level)
First, we’ll start with simple linear regression. Regress price on carat and check the regression diagnostics.
library(modelr)
library(ggfortify)
package ‘ggfortify’ was built under R version 3.6.2
# Simple linear regression: price on carat alone.
# Multiple r^2 = 0.8493 (see the summary output below).
model_2 <- lm(price ~ carat, data = diamonds_trim)
summary(model_2)
Call:
lm(formula = price ~ carat, data = diamonds_trim)
Residuals:
Min 1Q Median 3Q Max
-18585.3 -804.8 -18.9 537.4 12731.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2256.36 13.06 -172.8 <2e-16 ***
carat 7756.43 14.07 551.4 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1549 on 53938 degrees of freedom
Multiple R-squared: 0.8493, Adjusted R-squared: 0.8493
F-statistic: 3.041e+05 on 1 and 53938 DF, p-value: < 2.2e-16
CODECLAN SOLUTION
# CODECLAN version of the same simple regression (diamonds_tidy and
# diamonds_trim both drop only x, y and z, so this is the same fit as model_2).
mod1 <- lm(price ~ carat, data = diamonds_tidy)
# Arrange the four base-R diagnostic plots in a 2x2 grid.
# Fixed: capture and restore the previous graphics settings instead of
# leaving par(mfrow) permanently changed for later plots.
old_par <- par(mfrow = c(2, 2))
plot(mod1)
par(old_par)
# ggplot2-style diagnostics (ggfortify) for the earlier fit of the same model.
autoplot(model_2)
`arrange_()` is deprecated as of dplyr 0.7.0.
Please use `arrange()` instead.
See vignette('programming') for more help
[90mThis warning is displayed once every 8 hours.[39m
[90mCall `lifecycle::last_warnings()` to see where this warning was generated.[39m
Run a regression with one or both of the predictor and response variables log() transformed and recheck the diagnostics. Do you see any improvement?
CODECLAN SOLUTION
# Log-transform the predictor only.
mod2_logx <- lm(price ~ log(carat), data = diamonds_tidy)
par(mfrow = c(2,2))
plot(mod2_logx)
# Log-transform the response only.
mod2_logy <- lm(log(price) ~ carat, data = diamonds_tidy)
par(mfrow = c(2,2))
plot(mod2_logy)
# Log-transform both predictor and response.
mod2_logxlogy <- lm(log(price) ~ log(carat), data = diamonds_tidy)
par(mfrow = c(2,2))
plot(mod2_logxlogy)
# it looks like log transformation of both variables is required to get close to satisfying the regression assumptions.
Let’s use log() transformations of both predictor and response. Next, experiment with adding a single categorical predictor into the model. Which categorical predictor is best? [Hint - investigate r2 values]
CODECLAN SOLUTION
# log-log model plus cut as a categorical predictor.
# Multiple r^2 = 0.9371 (see summary below).
mod3_cut <- lm(log(price) ~ log(carat) + cut, data = diamonds_tidy)
summary(mod3_cut)
Call:
lm(formula = log(price) ~ log(carat) + cut, data = diamonds_tidy)
Residuals:
Min 1Q Median 3Q Max
-1.52247 -0.16484 -0.00587 0.16087 1.38115
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.200125 0.006343 1292.69 <2e-16 ***
log(carat) 1.695771 0.001910 887.68 <2e-16 ***
cutGood 0.163245 0.007324 22.29 <2e-16 ***
cutIdeal 0.317212 0.006632 47.83 <2e-16 ***
cutPremium 0.238217 0.006716 35.47 <2e-16 ***
cutVery Good 0.240753 0.006779 35.52 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2545 on 53934 degrees of freedom
Multiple R-squared: 0.9371, Adjusted R-squared: 0.9371
F-statistic: 1.607e+05 on 5 and 53934 DF, p-value: < 2.2e-16
# log-log model plus color as a categorical predictor.
# Multiple r^2 = 0.9454 (see summary below) — better than cut.
mod3_color <- lm(log(price) ~ log(carat) + color, data = diamonds_tidy)
summary(mod3_color)
Call:
lm(formula = log(price) ~ log(carat) + color, data = diamonds_tidy)
Residuals:
Min 1Q Median 3Q Max
-1.49987 -0.14888 0.00257 0.15316 1.27815
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.572034 0.003051 2809.531 < 2e-16 ***
log(carat) 1.728631 0.001814 952.727 < 2e-16 ***
colorE -0.025460 0.003748 -6.793 1.11e-11 ***
colorF -0.034455 0.003773 -9.132 < 2e-16 ***
colorG -0.055399 0.003653 -15.166 < 2e-16 ***
colorH -0.189859 0.003917 -48.468 < 2e-16 ***
colorI -0.286928 0.004383 -65.467 < 2e-16 ***
colorJ -0.425038 0.005417 -78.466 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2372 on 53932 degrees of freedom
Multiple R-squared: 0.9454, Adjusted R-squared: 0.9454
F-statistic: 1.333e+05 on 7 and 53932 DF, p-value: < 2.2e-16
# log-log model plus clarity as a categorical predictor.
# Multiple r^2 = 0.9654 (see summary below) — the best of the three
# single categorical additions (cut 0.9371, color 0.9454).
mod3_clarity <- lm(log(price) ~ log(carat) + clarity, data = diamonds_tidy)
summary(mod3_clarity)
Call:
lm(formula = log(price) ~ log(carat) + clarity, data = diamonds_tidy)
Residuals:
Min 1Q Median 3Q Max
-0.97521 -0.12085 0.01048 0.12561 1.85854
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.768115 0.006940 1119.25 <2e-16 ***
log(carat) 1.806424 0.001514 1193.23 <2e-16 ***
clarityIF 1.114625 0.008376 133.07 <2e-16 ***
claritySI1 0.624558 0.007163 87.19 <2e-16 ***
claritySI2 0.479658 0.007217 66.46 <2e-16 ***
clarityVS1 0.820461 0.007306 112.30 <2e-16 ***
clarityVS2 0.775248 0.007197 107.72 <2e-16 ***
clarityVVS1 1.028298 0.007745 132.77 <2e-16 ***
clarityVVS2 0.979221 0.007529 130.05 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1888 on 53931 degrees of freedom
Multiple R-squared: 0.9654, Adjusted R-squared: 0.9654
F-statistic: 1.879e+05 on 8 and 53931 DF, p-value: < 2.2e-16
[Try this question if you have looked at the material on transformations] Interpret the fitted coefficients for the levels of your chosen categorical predictor. Which level is the reference level? Which level shows the greatest difference in price from the reference level? [Hints - remember we are regressing the log(price) here, and think about what the presence of the log(carat) predictor implies. We’re not expecting a mathematical explanation]
CODECLAN SOLUTION
# 'I1' is the reference level for clarity. In multiple linear regression, the interpretation of any
# coefficient is the change in response due to that predictor with other predictors held constant.
# log(price) makes the relationship geometric. Clarity = 'IF' shows the most difference from the reference level.
# Here's how to interpret the `clarityIF` coefficient in the presence of the `log(price)` response.
# Fixed: pull the coefficient from the fitted model rather than hard-coding
# its rounded value (1.114625) from the printed summary; `[[` drops the
# element name so `ratio` still prints as a bare number.
ratio <- exp(coef(mod3_clarity)[["clarityIF"]])
ratio
[1] 3.048425
# so, on average, the price of an IF diamond will be approx. 3 times higher than that of I1 diamond of same carat
EXTENSION
Try adding an interaction between log(carat) and your chosen categorical predictor. Do you think this interaction term is statistically justified?
CODECLAN SOLUTION
# Add a log(carat):clarity interaction on top of the main effects
# (equivalent to the shorthand log(carat) * clarity).
# Multiple r^2 = 0.9658 vs 0.9654 without the interaction (see summaries).
mod4_clarity_inter <- lm(log(price) ~ log(carat) + clarity + log(carat):clarity, data = diamonds_tidy)
summary(mod4_clarity_inter)
Call:
lm(formula = log(price) ~ log(carat) + clarity + log(carat):clarity,
data = diamonds_tidy)
Residuals:
Min 1Q Median 3Q Max
-0.92773 -0.12104 0.01212 0.12465 1.51830
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.808102 0.007223 1080.98 <2e-16 ***
log(carat) 1.528106 0.014944 102.25 <2e-16 ***
clarityIF 1.122732 0.011381 98.65 <2e-16 ***
claritySI1 0.587556 0.007465 78.71 <2e-16 ***
claritySI2 0.438797 0.007486 58.62 <2e-16 ***
clarityVS1 0.790989 0.007721 102.45 <2e-16 ***
clarityVS2 0.723109 0.007530 96.03 <2e-16 ***
clarityVVS1 1.007591 0.009506 106.00 <2e-16 ***
clarityVVS2 0.968426 0.008359 115.85 <2e-16 ***
log(carat):clarityIF 0.337116 0.017593 19.16 <2e-16 ***
log(carat):claritySI1 0.288214 0.015254 18.89 <2e-16 ***
log(carat):claritySI2 0.258795 0.015437 16.76 <2e-16 ***
log(carat):clarityVS1 0.300307 0.015395 19.51 <2e-16 ***
log(carat):clarityVS2 0.250187 0.015237 16.42 <2e-16 ***
log(carat):clarityVVS1 0.301955 0.016317 18.51 <2e-16 ***
log(carat):clarityVVS2 0.321665 0.015717 20.47 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1877 on 53924 degrees of freedom
Multiple R-squared: 0.9658, Adjusted R-squared: 0.9658
F-statistic: 1.014e+05 on 15 and 53924 DF, p-value: < 2.2e-16
# obtain only a small improvement in r^2 from the interaction.
# we'll see shortly that the correct test would be an anova() comparing both models
# Nested-model F test: does the interaction's reduction in residual sum of
# squares justify its 7 extra degrees of freedom?
anova(mod3_clarity, mod4_clarity_inter)
Analysis of Variance Table
Model 1: log(price) ~ log(carat) + clarity
Model 2: log(price) ~ log(carat) + clarity + log(carat):clarity
Res.Df RSS Df Sum of Sq F Pr(>F)
1 53931 1923.1
2 53924 1900.6 7 22.558 91.433 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Find and plot an appropriate visualisation to show the effect of this interaction
CODECLAN SOLUTION
# One fitted line per clarity level; faceting by clarity makes differences
# in gradient between the panels easier to compare by eye.
ggplot(diamonds_tidy, aes(x = log(carat), y = log(price), colour = clarity)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ clarity)
# not much evidence that the gradient of the line varies significantly with clarity
FOR EXPLANATION - RESIDUAL STANDARD ERROR AND R SQUARED ARE BOTH IMPORTANT
FOR PREDICTION - WE ARE INTERESTED IN RESIDUAL STANDARD ERROR ALONE